About the data used
The Wine Quality Data Set is a collection of data on wines with 13 attributes and 6497 records when merged. The original data is subdivided into two separate data sets, one for white wines and the other for red wines. These attributes include: wine type, fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, and quality (score between 0 and 10). a. The source for the dataset can be found here: i. P. Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez; A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region (CVRVV), Porto, Portugal, ©2009 ii. Weblink: http://archive.ics.uci.edu/ml/datasets/Wine+Quality
Standardize the data, create dummy variables as needed, and segment for testing and training purposes.
#clear environment
# NOTE(review): rm(list = ls()) inside a script is discouraged — it wipes the
# caller's entire workspace; prefer running the script in a fresh R session.
rm(list = ls())
#load data via csv file
# Expects the merged red+white wine-quality CSV in the working directory.
my_data<-read.csv("winequality_white_red.csv")
#assign all the correct numeric and factor variables here, set factor levels as needed
# Inspect the structure; the printed output below documents the 13 columns.
str(my_data)
## 'data.frame': 6497 obs. of 13 variables:
## $ Wine_type : Factor w/ 2 levels "red","White": 2 2 2 2 2 2 2 2 2 2 ...
## $ fixed_acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile_acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric_acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual_sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free_sulfur_dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total_sulfur_dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
#exclude any variables you want by making them characters; conversely, include by making them numeric, integer, factor, or recode levels as text to get names to appear as column headers later
# Wine_type becomes character so the dummy-coding loop below skips it;
# quality becomes a factor so it IS dummy-coded.
my_data$Wine_type<-as.character((my_data$Wine_type))
my_data$quality<-as.factor((my_data$quality))
#get the column names
variable_names<- colnames(my_data)
# Containers for the scaling / dummy-coding loops below.
# NOTE(review): each is seeded with the single value ncol(my_data); every
# element is overwritten inside the loops, so the seed value is never used
# as data — preallocating with vector()/list() would be cleaner.
numeric_columns<-c(ncol(my_data))
factor_list<-list(ncol(my_data))
dat<-c(ncol(my_data))
#scale all the ints
# Standardize every numeric column to mean 0 / sd 1. For each numeric column i:
#  - numeric_columns[i] records the new "<name>_scaled" column name,
#  - dat[[i]] holds a one-column data.frame of the scaled values,
#  - assign() also publishes that data.frame into the global environment
#    under the scaled name (the .GlobalEnv-based recombine step relies on this).
# Changes vs original: seq_len() instead of 1:ncol(), paste0() instead of
# paste(sep = ""), and the redundant double parentheses removed.
for (i in seq_len(ncol(my_data))) {
  if (is.numeric(my_data[, i])) {
    numeric_columns[i] <- paste0(colnames(my_data[i]), "_scaled")
    dat[[i]] <- as.data.frame(scale(my_data[, i]))
    names(dat[[i]]) <- numeric_columns[i]
    assign(numeric_columns[i], dat[[i]])
  }
}
#create all the dummy variables
library(psych)
#creates a list of all the dummy coded variables
# Factor columns become a data.frame of 0/1 dummy columns (one per level)
# via psych::dummy.code; non-factor columns get the sentinel string "empty"
# so factor_list stays index-aligned with my_data's columns.
for (i in seq_len(ncol(my_data))) {
  if (is.factor(my_data[, i])) {
    factor_list[[i]] <- as.data.frame(dummy.code(my_data[, i]))
  } else {
    factor_list[[i]] <- "empty"
  }
}
# Collect all dummy-coded columns into a single data.frame, my_data_cat.
my_data_cat<-data.frame(matrix(nrow =nrow(my_data), ncol =1))
my_data_cat<- my_data_cat[,-ncol(my_data_cat)]#removes NA column originally created
for (i in seq_along(factor_list)) {
  # Only factor columns produced a data.frame; "empty" sentinels are skipped.
  # FIX: the original also compared the whole data.frame to "empty" with `&&`,
  # which errors on R >= 4.3 (non-scalar condition) and was redundant anyway.
  if (is.data.frame(factor_list[[i]])) {
    # Drop the first dummy column (reference level) to avoid perfect collinearity.
    factor_list[[i]] <- factor_list[[i]][-1]
    my_data_cat <- do.call(cbind, list(factor_list[[i]], my_data_cat))
  }
  # A two-level factor reduces to a single dummy named after a bare level
  # label ("0" or "1"); rename it back to the original variable name.
  # FIX: the original tested ("1" ... | "1" ...) — the same condition twice;
  # "0" was clearly intended in one of the two branches.
  if (("0" %in% colnames(factor_list[[i]])[1]) || ("1" %in% colnames(factor_list[[i]])[1])) {
    colnames(factor_list[[i]]) <- variable_names[i]
  }
  # FIX: the original third branch, is.data.frame(factor_list[[i]][1] == "empty"),
  # could never be TRUE (is.data.frame of a logical/matrix), so it was dead
  # code and has been removed.
}
#recombine the data into one dataset
# NOTE(review): this scrapes every data.frame out of the global environment
# and cbinds them — fragile (any stray data.frame in .GlobalEnv gets pulled
# in, and column order depends on environment listing order). TODO verify
# on re-run; building from dat/my_data_cat directly would be safer.
numeric_data<- sapply(.GlobalEnv, is.data.frame)
my_data_all<-do.call(cbind, mget(names(numeric_data)[numeric_data]))
#refine for analysis
#remove any dataset with "my_data." as column header
# (drops the raw, unscaled columns that came in via the my_data data.frame;
# "my_data_cat.*" names do NOT match "^my_data\\." and are kept)
# NOTE(review): remove_old_columns is assigned but never used afterwards.
remove_old_columns<- colnames(my_data_all)
my_data_all_2<-my_data_all[, !grepl(glob2rx("my_data.*"), names(my_data_all))]
write.csv(my_data_all, "my_data_all.csv")
write.csv(my_data_all_2,"my_data_all_2.csv")
#add back in the Y factor
my_data_all_3<-cbind(my_data_all_2,my_data$quality, my_data$Wine_type)
write.csv(my_data_all_3, "my_data_all_3.csv")
# Shuffle rows reproducibly before folding.
set.seed(1)
my_data_all_3<-my_data_all_3[sample(nrow(my_data_all_3)),]
#Create 10 equally sized folds
folds <- as.data.frame(cut(seq(1,nrow(my_data_all_3)),breaks=10,labels=FALSE))
my_data_all_3<-cbind(folds,my_data_all_3)
#save segmented portions to drive as set1..set10
# NOTE(review): the fold column keeps the auto-generated name
# `cut(seq(1, nrow(...)), breaks = 10, labels = FALSE)` — renaming it once
# here would remove the backtick gymnastics; eval(parse(...)) below is an
# anti-pattern, get(paste0("set", i)) would do the same job.
for (i in 1:10){
assign(paste("set",i, sep =""), my_data_all_3[my_data_all_3$`cut(seq(1, nrow(my_data_all_3)), breaks = 10, labels = FALSE)`==i,])
write.csv(eval(parse(text= paste("set",i,sep=""))), paste("set",i,".csv", sep = ""))
head(my_data_all_3)
}
General overview of the data pre-processing. There are significantly more records of white wines than of red.
library(ggplot2)
library(plotly)
# Bar chart of record counts by wine type (white vs red).
# FIX: use bare column names inside aes() together with the data argument —
# passing my_data$... into aes() bypasses ggplot2's data masking and produces
# ugly "my_data$Wine_type" axis/legend labels.
wine_data_type_summary_plot<- ggplot(my_data, aes(x = Wine_type, color = Wine_type, fill = Wine_type)) + geom_bar()
ggplotly(wine_data_type_summary_plot)
###Determine the best predictors for wine type
set.seed(11)
#conduct all testing on set2-set10
# Fold set1 is held out as the test set; the other nine folds form training.
my_train_data<-rbind(set2, set3, set4, set5, set6, set7, set8, set9, set10)
paste("proportion of training data:", "~",round(100*nrow(my_train_data)/nrow(my_data)), "%")
## [1] "proportion of training data: ~ 90 %"
library(ggfortify)
library(ggplot2)
library(plotly)
library(glmnet)
# 10-fold cross-validated lasso (alpha = 1) logistic regression of wine type
# on the prepared predictors. Columns 2:18 are assumed to be the dummy +
# scaled-numeric predictors and the last column the wine-type response —
# TODO confirm this matches the cbind order produced by the recombine step.
cross_validated_fit <- cv.glmnet(x = as.matrix(my_train_data[,c(2:18)]), y = as.matrix(my_train_data[,ncol(my_train_data)]), family = "binomial", type.measure = "class", nfolds = 10, nlambda = 1000, alpha = 1)
# Misclassification error vs log(lambda), then the coefficient paths.
ggplotly(autoplot(cross_validated_fit, color = "orange"))
ggplotly(autoplot(cross_validated_fit$glmnet.fit, "lambda", label = TRUE))
Based on the cross-validated plot of misclassification error against log(lambda), using the lambda one standard error from the minimum (lambda.1se) should be fine. Though it should be noted that one could use a log(lambda) of between -4 and -2 without a significant increase in misclassification error (still <.05) but with the added benefit of a more parsimonious model (needing ~6 variables vs 11 with the recommended lambda).
Consequently, we will consider two models: one with the lowest misclassification level and the other with the highest level of parsimony.
# Lambda one standard error above the minimum-error lambda (the "1se" rule).
cross_validated_fit$lambda.1se
## [1] 0.002579141
# Refit the lasso at lambda.1se only.
lasso_model1<-glmnet(x = as.matrix(my_train_data[,c(2:18)]), y = as.matrix(my_train_data[,ncol(my_train_data)]), family = "binomial", lambda = cross_validated_fit$lambda.1se, alpha = 1)
# Sparse coefficient vector; "." marks predictors the lasso dropped.
lasso_model1$beta
## 17 x 1 sparse Matrix of class "dgCMatrix"
## s0
## my_data_cat.4 1.56360580
## my_data_cat.5 .
## my_data_cat.6 .
## my_data_cat.7 .
## my_data_cat.8 .
## my_data_cat.9 .
## pH_scaled -0.32870198
## density_scaled -2.31318165
## alcohol_scaled -0.44769990
## sulphates_scaled -0.66392192
## residual_sugar_scaled 1.75575020
## citric_acid_scaled .
## free_sulfur_dioxide_scaled .
## total_sulfur_dioxide_scaled 2.21239836
## volatile_acidity_scaled -1.28172516
## fixed_acidity_scaled -0.08478189
## chlorides_scaled -0.54835257
#run predictions against the TRAINING data (in-sample check)
# NOTE(review): predict.glmnet appears to return the linear predictor (link
# scale) here — hence the explicit inverse logit below; calling predict() on
# the fitted lognet object with type = "response" would give probabilities
# directly. TODO confirm on re-run.
lasso_model1_train_prediction<-predict.glmnet(lasso_model1, newx = as.matrix(my_train_data[2:18]), s = cross_validated_fit$lambda.1se, type = "response" )
#convert results to probabilities via inverse logit
lasso_model1_train_prediction_probs<- exp(lasso_model1_train_prediction)/(1+exp(lasso_model1_train_prediction))
# Attach the probabilities (column named `1`), threshold at 0.5
# (>= .5 -> "White", else "red"), and cross-tabulate against the truth.
my_train_data_updated<-as.data.frame(cbind(my_train_data, lasso_model1_train_prediction_probs))
my_train_data_updated$predictions_logistic_model1<-ifelse(my_train_data_updated$`1` >=.5,"White","red")
my_confusion_table_train_class_red<-table(my_train_data_updated$`my_data$Wine_type`, my_train_data_updated$predictions_logistic_model1)
my_confusion_table_train_class_red
##
## red White
## red 1406 22
## White 13 4406
#columns= predictions
#rows = reference
#run predictions against the held-out test fold (set1)
lasso_model1_test_prediction<-predict.glmnet(lasso_model1, newx = as.matrix(set1[,2:18]), s = cross_validated_fit$lambda.1se, type = "response" )
#convert results to probabilities via inverse logit
lasso_model1_test_prediction_probs<- exp(lasso_model1_test_prediction)/(1+exp(lasso_model1_test_prediction))
# Attach probabilities, threshold at 0.5, and cross-tabulate vs the truth
# (rows = reference, columns = predictions, as in the training table).
set1_updated<-as.data.frame(cbind(set1, lasso_model1_test_prediction_probs))
set1_updated$predictions_logistic_model1<-ifelse(set1_updated$`1`>=.5,"White","red")
my_confusion_table_test_class_red<-table(set1_updated$`my_data$Wine_type`, set1_updated$predictions_logistic_model1)
my_confusion_table_test_class_red
##
## red White
## red 170 1
## White 2 477
#columns= predictions
#rows = reference
# Precision / recall / F1 for class "red" on TRAINING data.
# Table layout: rows = reference, columns = predictions, so [1,1] is a true
# "red", [1,2] a missed "red" (FN), [2,1] a false "red" (FP).
TP_train_red<-my_confusion_table_train_class_red[1,1]
FN_train_red<-my_confusion_table_train_class_red[1,2]
FP_train_red<-my_confusion_table_train_class_red[2,1]
TN_train_red<-my_confusion_table_train_class_red[2,2]
my_confusion_table_precision_train_class_red <- TP_train_red/(TP_train_red + FP_train_red)
my_confusion_table_recall_train_class_red <- TP_train_red / (TP_train_red + FN_train_red)
# F1 = harmonic mean of precision and recall.
my_confusion_table_F1score_train_class_red <-2*(my_confusion_table_precision_train_class_red*my_confusion_table_recall_train_class_red/(my_confusion_table_precision_train_class_red + my_confusion_table_recall_train_class_red))
paste(
"precision_train_class_red = ", my_confusion_table_precision_train_class_red,
"recall_train_class_red = ", my_confusion_table_recall_train_class_red,
"F1_score_train_class_red = ",my_confusion_table_F1score_train_class_red
)
## [1] "precision_train_class_red = 0.990838618745596 recall_train_class_red = 0.984593837535014 F1_score_train_class_red = 0.987706357569371"
# Precision / recall / F1 for class "red" on the held-out TEST fold,
# mirroring the training-data computation above.
TP_test_red<-my_confusion_table_test_class_red[1,1]
FN_test_red<-my_confusion_table_test_class_red[1,2]
FP_test_red<-my_confusion_table_test_class_red[2,1]
TN_test_red<-my_confusion_table_test_class_red[2,2]
my_confusion_table_precision_test_class_red <- TP_test_red/(TP_test_red + FP_test_red)
my_confusion_table_recall_test_class_red <- TP_test_red / (TP_test_red + FN_test_red)
my_confusion_table_F1score_test_class_red <-2*(my_confusion_table_precision_test_class_red*my_confusion_table_recall_test_class_red/(my_confusion_table_precision_test_class_red + my_confusion_table_recall_test_class_red))
paste(
"precision_test_class_red = ", my_confusion_table_precision_test_class_red,
"recall_test_class_red = ", my_confusion_table_recall_test_class_red,
"F1_score_test_class_red = ",my_confusion_table_F1score_test_class_red
)
## [1] "precision_test_class_red = 0.988372093023256 recall_test_class_red = 0.994152046783626 F1_score_test_class_red = 0.991253644314869"
# Build the class-White confusion table by swapping the roles of the two
# classes in the class-red table (TP <-> TN, FP <-> FN), then compute the
# same precision / recall / F1 with "White" as the positive class.
my_confusion_table_train_class_White<-as.data.frame(rbind(c(my_confusion_table_train_class_red[2,2],
my_confusion_table_train_class_red[2,1]),
c(my_confusion_table_train_class_red[1,2],my_confusion_table_train_class_red[1,1])))
colnames(my_confusion_table_train_class_White)<-c("White","red")
rownames(my_confusion_table_train_class_White) <- c("White","red")
my_confusion_table_train_class_White
TP_train_White<-my_confusion_table_train_class_White[1,1]
FN_train_White<-my_confusion_table_train_class_White[1,2]
FP_train_White<-my_confusion_table_train_class_White[2,1]
TN_train_White<-my_confusion_table_train_class_White[2,2]
my_confusion_table_precision_train_class_White <- TP_train_White/(TP_train_White + FP_train_White)
my_confusion_table_recall_train_class_White <- TP_train_White / (TP_train_White + FN_train_White)
my_confusion_table_F1score_train_class_White <- 2*(my_confusion_table_precision_train_class_White*my_confusion_table_recall_train_class_White/(my_confusion_table_precision_train_class_White + my_confusion_table_recall_train_class_White))
paste(
"precision_train_class_White = ", my_confusion_table_precision_train_class_White,
"recall_train_class_White = ", my_confusion_table_recall_train_class_White,
"F1_score_train_class_White = ",my_confusion_table_F1score_train_class_White
)
## [1] "precision_train_class_White = 0.995031616982836 recall_train_class_White = 0.997058157954288 F1_score_train_class_White = 0.996043856674579"
Transpose the two-way confusion table for class red to obtain the corresponding table for class White.
# Class-White confusion table and metrics on the held-out TEST fold,
# derived from the class-red test table by swapping the two classes.
my_confusion_table_test_class_White<-as.data.frame(rbind(c(my_confusion_table_test_class_red[2,2],
my_confusion_table_test_class_red[2,1]),
c(my_confusion_table_test_class_red[1,2],my_confusion_table_test_class_red[1,1])))
colnames(my_confusion_table_test_class_White)<-c("White","red")
rownames(my_confusion_table_test_class_White) <- c("White","red")
my_confusion_table_test_class_White
TP_test_White<-my_confusion_table_test_class_White[1,1]
FN_test_White<-my_confusion_table_test_class_White[1,2]
FP_test_White<-my_confusion_table_test_class_White[2,1]
TN_test_White<-my_confusion_table_test_class_White[2,2]
my_confusion_table_precision_test_class_White <- TP_test_White/(TP_test_White + FP_test_White)
my_confusion_table_recall_test_class_White <- TP_test_White / (TP_test_White + FN_test_White)
my_confusion_table_F1score_test_class_White<-2*(my_confusion_table_precision_test_class_White*my_confusion_table_recall_test_class_White/(my_confusion_table_precision_test_class_White + my_confusion_table_recall_test_class_White))
paste(
"precision_test_class_White = ", my_confusion_table_precision_test_class_White,
"recall_test_class_White = ", my_confusion_table_recall_test_class_White,
"F1_score_test_class_White = ",my_confusion_table_F1score_test_class_White
)
## [1] "precision_test_class_White = 0.997907949790795 recall_test_class_White = 0.995824634655532 F1_score_test_class_White = 0.996865203761755"
# Overall accuracy = correct predictions / all predictions. Accuracy is
# class-symmetric, so the class-White table would give the same number.
my_confusion_table_accuracy_train_class_red <- (TP_train_red + TN_train_red) / sum(TP_train_red, TN_train_red, FP_train_red, FN_train_red)
my_confusion_table_accuracy_train_class_red # identical if computed from the class-White table
my_confusion_table_accuracy_test_class_red <- (TP_test_red + TN_test_red) / sum(TP_test_red, TN_test_red, FP_test_red, FN_test_red)
my_confusion_table_accuracy_test_class_red # identical if computed from the class-White table
## [1] 0.9953846
library(ROCR)
# Recode truth and predictions to "1" (White) / "0" (red) for ROCR.
my_train_data_updated$`my_data$Wine_type`<-ifelse(my_train_data_updated$`my_data$Wine_type`=="White","1","0")
my_train_data_updated$predictions_logistic_model1<-ifelse(my_train_data_updated$predictions_logistic_model1=="White","1","0")
# NOTE(review): ROCR::prediction expects (predictions, labels); here column
# ncol-2 (the recoded truth) is passed first and the hard 0/1 predicted
# labels second — with binary scores the ROC is a single operating point
# either way, but passing the fitted probabilities (column `1`) as the
# first argument would give a proper curve. TODO confirm intent.
pred_train<- prediction(as.matrix(as.numeric(my_train_data_updated[,(ncol(my_train_data_updated)-2)])), as.matrix(as.numeric(my_train_data_updated[,ncol(my_train_data_updated)])))
perf_train<-performance(pred_train, "tpr", "fpr")
plot(perf_train)
# AUC on training data.
auc_train<-attr(performance(pred_train, "auc"),"y.values")
auc_train
## [[1]]
## [1] 0.9929351
library(ROCR)
# Same ROC/AUC computation for the held-out test fold (set1),
# after recoding truth and predictions to "1" (White) / "0" (red).
set1_updated$`my_data$Wine_type`<-ifelse(set1_updated$`my_data$Wine_type`=="White","1","0")
set1_updated$predictions_logistic_model1<-ifelse(set1_updated$predictions_logistic_model1=="White","1","0")
# NOTE(review): same argument-order caveat as the training ROC above —
# ROCR::prediction expects (predictions, labels); verify on re-run.
pred_test<- prediction(as.matrix(as.numeric(set1_updated[,(ncol(set1_updated)-2)])), as.matrix(as.numeric(set1_updated[,ncol(set1_updated)])))
perf_test<-performance(pred_test, "tpr", "fpr")
plot(perf_test)
auc_test<-attr(performance(pred_test, "auc"),"y.values")
auc_test
## [[1]]
## [1] 0.99314
# Model 2: ordinary (unpenalized) logistic regression using only the
# predictors the lasso retained.
y<-my_train_data[,ncol(my_train_data)]
# NOTE(review): referencing my_train_data$... inside the formula instead of
# bare names with `data = my_train_data` works here, but it clutters the
# coefficient names and makes predict(..., newdata = ...) impossible;
# refactor when the report is re-run.
lasso_model2_no_coefs<-glm(formula = y~ my_train_data$my_data_cat.4 +
my_train_data$pH_scaled +
my_train_data$fixed_acidity_scaled +
my_train_data$chlorides_scaled +
my_train_data$residual_sugar_scaled +
my_train_data$density_scaled +
my_train_data$total_sulfur_dioxide_scaled +
my_train_data$volatile_acidity_scaled +
my_train_data$alcohol_scaled +
my_train_data$sulphates_scaled,
family = "binomial")
summary(lasso_model2_no_coefs)
##
## Call:
## glm(formula = y ~ my_train_data$my_data_cat.4 + my_train_data$pH_scaled +
## my_train_data$fixed_acidity_scaled + my_train_data$chlorides_scaled +
## my_train_data$residual_sugar_scaled + my_train_data$density_scaled +
## my_train_data$total_sulfur_dioxide_scaled + my_train_data$volatile_acidity_scaled +
## my_train_data$alcohol_scaled + my_train_data$sulphates_scaled,
## family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3035 0.0017 0.0182 0.0563 6.9780
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 4.0628 0.2790 14.563
## my_train_data$my_data_cat.4 2.4757 0.5828 4.248
## my_train_data$pH_scaled 0.1028 0.2237 0.460
## my_train_data$fixed_acidity_scaled 0.5684 0.2941 1.933
## my_train_data$chlorides_scaled -0.6789 0.1426 -4.761
## my_train_data$residual_sugar_scaled 4.0721 0.4528 8.994
## my_train_data$density_scaled -5.0360 0.5585 -9.017
## my_train_data$total_sulfur_dioxide_scaled 2.3960 0.2127 11.264
## my_train_data$volatile_acidity_scaled -1.3447 0.1622 -8.289
## my_train_data$alcohol_scaled -2.0039 0.3249 -6.168
## my_train_data$sulphates_scaled -0.4251 0.1808 -2.351
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## my_train_data$my_data_cat.4 2.16e-05 ***
## my_train_data$pH_scaled 0.6458
## my_train_data$fixed_acidity_scaled 0.0533 .
## my_train_data$chlorides_scaled 1.93e-06 ***
## my_train_data$residual_sugar_scaled < 2e-16 ***
## my_train_data$density_scaled < 2e-16 ***
## my_train_data$total_sulfur_dioxide_scaled < 2e-16 ***
## my_train_data$volatile_acidity_scaled < 2e-16 ***
## my_train_data$alcohol_scaled 6.91e-10 ***
## my_train_data$sulphates_scaled 0.0187 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6500.75 on 5846 degrees of freedom
## Residual deviance: 407.05 on 5836 degrees of freedom
## AIC: 429.05
##
## Number of Fisher Scoring iterations: 9
# In-sample predicted probabilities from the glm refit (type = "response"
# gives probabilities directly here, unlike the glmnet predictions earlier).
lasso_model2_no_coefs_prediction<-predict(lasso_model2_no_coefs, type= "response")
lasso_model2_no_coefs_prediction_classes<-ifelse(lasso_model2_no_coefs_prediction>=.5,"White","red")
# Confusion table for the refit model on the training data.
my_confusion_table_train_no_coefs_class_red <-table(cbind(as.data.frame(my_train_data$`my_data$Wine_type`), lasso_model2_no_coefs_prediction_classes))
my_confusion_table_train_no_coefs_class_red
## lasso_model2_no_coefs_prediction_classes
## my_train_data$`my_data$Wine_type` red White
## red 1412 16
## White 14 4405
# For comparison: the lasso (model 1) training confusion table.
my_confusion_table_train_class_red
##
## red White
## red 1406 22
## White 13 4406
# Precision / recall / F1 for class "red", model 2 (glm refit), training data.
# NOTE(review): these REUSE (overwrite) the TP/FN/FP/TN_train_red globals
# from the model-1 section; any later code expecting the model-1 values
# silently gets these instead.
TP_train_red<-my_confusion_table_train_no_coefs_class_red[1,1]
FN_train_red<-my_confusion_table_train_no_coefs_class_red[1,2]
FP_train_red<-my_confusion_table_train_no_coefs_class_red[2,1]
TN_train_red<-my_confusion_table_train_no_coefs_class_red[2,2]
my_confusion_table_precision_train_no_coefs_class_red <- TP_train_red/(TP_train_red + FP_train_red)
my_confusion_table_recall_train_no_coefs_class_red <- TP_train_red / (TP_train_red + FN_train_red)
my_confusion_table_F1score_train_no_coefs_class_red<-2*(my_confusion_table_precision_train_no_coefs_class_red*my_confusion_table_recall_train_no_coefs_class_red/(my_confusion_table_precision_train_no_coefs_class_red + my_confusion_table_recall_train_no_coefs_class_red))
paste(
"precision_train_no_coefs_class_red = ", my_confusion_table_precision_train_no_coefs_class_red,
"recall_train_no_coefs_class_red = ", my_confusion_table_recall_train_no_coefs_class_red,
"F1_score_train_no_coefs_class_red = ",my_confusion_table_F1score_train_no_coefs_class_red
)
## [1] "precision_train_no_coefs_class_red = 0.990182328190743 recall_train_no_coefs_class_red = 0.988795518207283 F1_score_train_no_coefs_class_red = 0.989488437281009"
# Class-White table and metrics for model 2 (glm refit), training data,
# derived by swapping the two classes in the class-red table.
my_confusion_table_train_no_coefs_class_White<-as.data.frame(rbind(c(my_confusion_table_train_no_coefs_class_red[2,2],
my_confusion_table_train_no_coefs_class_red[2,1]),
c(my_confusion_table_train_no_coefs_class_red[1,2],my_confusion_table_train_no_coefs_class_red[1,1])))
colnames(my_confusion_table_train_no_coefs_class_White)<-c("White","red")
rownames(my_confusion_table_train_no_coefs_class_White) <- c("White","red")
my_confusion_table_train_no_coefs_class_White
TP_train_White<-my_confusion_table_train_no_coefs_class_White[1,1]
FN_train_White<-my_confusion_table_train_no_coefs_class_White[1,2]
FP_train_White<-my_confusion_table_train_no_coefs_class_White[2,1]
TN_train_White<-my_confusion_table_train_no_coefs_class_White[2,2]
my_confusion_table_precision_train_no_coefs_class_White <- TP_train_White/(TP_train_White + FP_train_White)
my_confusion_table_recall_train_no_coefs_class_White <- TP_train_White / (TP_train_White + FN_train_White)
my_confusion_table_F1score_train_no_coefs_class_White<-2*(my_confusion_table_precision_train_no_coefs_class_White*my_confusion_table_recall_train_no_coefs_class_White/(my_confusion_table_precision_train_no_coefs_class_White + my_confusion_table_recall_train_no_coefs_class_White))
paste(
"precision_train_no_coefs_class_White = ", my_confusion_table_precision_train_no_coefs_class_White,
"recall_train_no_coefs_class_White = ", my_confusion_table_recall_train_no_coefs_class_White,
"F1_score_train_no_coefs_class_White = ",my_confusion_table_F1score_train_no_coefs_class_White
)
## [1] "precision_train_no_coefs_class_White = 0.996380909296539 recall_train_no_coefs_class_White = 0.99683186241231 F1_score_train_no_coefs_class_White = 0.996606334841629"
library(e1071)
set.seed(1)
# Radial-kernel SVM with 10-fold cross-validation; scale = FALSE because the
# numeric predictors were already standardized upstream.
# NOTE(review): predictors are columns 2:19 with column 20 as the response —
# this differs from the 2:18 range used for glmnet above; confirm column 19
# is a genuine predictor and not an appended probability/label column.
m.svm_radial_train<- svm(my_train_data_updated[,2:19], as.factor(my_train_data_updated[,20]), cross = 10, scale = FALSE, kernel = "radial", probability = TRUE)
summary(m.svm_radial_train)
##
## Call:
## svm.default(x = my_train_data_updated[, 2:19], y = as.factor(my_train_data_updated[,
## 20]), scale = FALSE, kernel = "radial", cross = 10, probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.05555556
##
## Number of Support Vectors: 272
##
## ( 145 127 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 99.60664
## Single Accuracies:
## 99.14384 99.65812 99.65812 99.82877 99.65812 99.82906 99.4863 99.48718 99.82906 99.48718
# Linear-kernel SVM on the same columns for comparison with the radial fit
# (same 2:19 predictor / column-20 response caveat as above).
m.svm_linear_train<- svm(my_train_data_updated[,2:19], as.factor(my_train_data_updated[,20]), cross = 10, scale = FALSE, kernel = "linear", probability = TRUE)
summary(m.svm_linear_train)
##
## Call:
## svm.default(x = my_train_data_updated[, 2:19], y = as.factor(my_train_data_updated[,
## 20]), scale = FALSE, kernel = "linear", cross = 10, probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.05555556
##
## Number of Support Vectors: 142
##
## ( 73 69 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 99.46981
## Single Accuracies:
## 99.14384 99.82906 99.1453 99.65753 99.82906 99.31624 99.31507 99.1453 99.31624 100
# Majority-vote ensemble of the three classifiers (logistic model 1, linear
# SVM, radial SVM), all coded as 0/1 integers.
m.svm_linear_train_fitted<-as.data.frame(as.integer(as.character(m.svm_linear_train$fitted)))
m.svm_radial_train_fitted<-as.data.frame(as.integer(as.character(m.svm_radial_train$fitted)))
my_train_data_updated$predictions_logistic_model1<-as.integer(as.character(my_train_data_updated$predictions_logistic_model1))
my_train_data_updated<-cbind(my_train_data_updated,m.svm_linear_train_fitted, m.svm_radial_train_fitted)
# Sum of the three 0/1 votes; >= 2 means at least two models said "White" (1).
my_train_data_updated$majority_vote<-my_train_data_updated$predictions_logistic_model1+
my_train_data_updated$`as.integer(as.character(m.svm_radial_train$fitted))`+
my_train_data_updated$`as.integer(as.character(m.svm_linear_train$fitted))`
my_train_data_updated$majority_vote<-ifelse(my_train_data_updated$majority_vote>=2,1,0)
# Confusion table (0 = red, 1 = White after the earlier recoding).
my_confusion_table_train_ensemble1_class_red<-table(my_train_data_updated$`my_data$Wine_type`, my_train_data_updated$majority_vote)
my_confusion_table_train_ensemble1_class_red
##
## 0 1
## 0 1411 17
## 1 9 4410
# Precision / recall / F1 for class "red" (coded 0) of the ensemble vote.
# NOTE(review): overwrites the TP/FN/FP/TN_train_red globals yet again.
TP_train_red<-my_confusion_table_train_ensemble1_class_red[1,1]
FN_train_red<-my_confusion_table_train_ensemble1_class_red[1,2]
FP_train_red<-my_confusion_table_train_ensemble1_class_red[2,1]
TN_train_red<-my_confusion_table_train_ensemble1_class_red[2,2]
my_confusion_table_precision_train_ensemble1_class_red <- TP_train_red/(TP_train_red + FP_train_red)
my_confusion_table_recall_train_ensemble1_class_red <- TP_train_red / (TP_train_red + FN_train_red)
my_confusion_table_F1score_train_ensemble1_class_red<-2*(my_confusion_table_precision_train_ensemble1_class_red*my_confusion_table_recall_train_ensemble1_class_red/(my_confusion_table_precision_train_ensemble1_class_red + my_confusion_table_recall_train_ensemble1_class_red))
paste(
"precision_train_ensemble1_class_red = ", my_confusion_table_precision_train_ensemble1_class_red,
"recall_train_ensemble1_class_red = ", my_confusion_table_recall_train_ensemble1_class_red,
"F1_score_train_ensemble1_class_red = ",my_confusion_table_F1score_train_ensemble1_class_red
)
## [1] "precision_train_ensemble1_class_red = 0.993661971830986 recall_train_ensemble1_class_red = 0.988095238095238 F1_score_train_ensemble1_class_red = 0.990870786516854"
# Class-White table and metrics for the ensemble vote, derived by swapping
# the two classes in the class-red table (TP <-> TN, FP <-> FN).
my_confusion_table_train_ensemble1_class_White<-as.data.frame(rbind(c(my_confusion_table_train_ensemble1_class_red[2,2],
my_confusion_table_train_ensemble1_class_red[2,1]),
c(my_confusion_table_train_ensemble1_class_red[1,2],my_confusion_table_train_ensemble1_class_red[1,1])))
colnames(my_confusion_table_train_ensemble1_class_White)<-c("White","red")
rownames(my_confusion_table_train_ensemble1_class_White) <- c("White","red")
my_confusion_table_train_ensemble1_class_White
TP_train_White<-my_confusion_table_train_ensemble1_class_White[1,1]
FN_train_White<-my_confusion_table_train_ensemble1_class_White[1,2]
FP_train_White<-my_confusion_table_train_ensemble1_class_White[2,1]
TN_train_White<-my_confusion_table_train_ensemble1_class_White[2,2]
my_confusion_table_precision_train_ensemble1_class_White <- TP_train_White/(TP_train_White + FP_train_White)
my_confusion_table_recall_train_ensemble1_class_White<- TP_train_White / (TP_train_White + FN_train_White)
# BUG FIX: the F1 numerator previously used the class-RED precision
# (my_confusion_table_precision_train_ensemble1_class_red) — a copy/paste
# error; F1 for class White must use the class-White precision. The printed
# F1 below (0.99456...) therefore understates the correct value (~0.9971
# given the precision/recall shown) and will change on re-run.
my_confusion_table_F1score_train_ensemble1_class_White<-2*(my_confusion_table_precision_train_ensemble1_class_White*my_confusion_table_recall_train_ensemble1_class_White/(my_confusion_table_precision_train_ensemble1_class_White + my_confusion_table_recall_train_ensemble1_class_White))
paste(
"precision_train_ensemble1_class_White = ", my_confusion_table_precision_train_ensemble1_class_White,
"recall_train_ensemble1_class_White = ", my_confusion_table_recall_train_ensemble1_class_White,
"F1_score_train_ensemble1_class_White = ",my_confusion_table_F1score_train_ensemble1_class_White
)
## [1] "precision_train_ensemble1_class_White = 0.996159927716286 recall_train_ensemble1_class_White = 0.9979633401222 F1_score_train_ensemble1_class_White = 0.994560603503453"
# Inspect the training rows where the linear and radial SVMs disagree.
my_train_data_updated[which(my_train_data_updated$`as.integer(as.character(m.svm_linear_train$fitted))`!=my_train_data_updated$`as.integer(as.character(m.svm_radial_train$fitted))`),]